library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)
##
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
##
## filter
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(gganimate)
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
theme_set(theme_classic(18) +
theme(legend.position = "bottom"))
raw_data = read_csv("bird_impacts.csv") %>%
janitor::clean_names() %>%
dplyr::mutate(incident_month = factor(incident_month),
incident_year = factor(incident_year),
month = lubridate::month(incident_date, label = TRUE, abbr = TRUE))
## Parsed with column specification:
## cols(
## .default = col_character(),
## incident_date = col_datetime(format = ""),
## num_engs = col_double(),
## incident_month = col_double(),
## incident_year = col_double(),
## time = col_double(),
## height = col_double(),
## speed = col_double(),
## cost_repairs_infl_adj = col_double()
## )
## See spec(...) for full column specifications.
raw_data %>%
skimr::skim()
## Skim summary statistics
## n obs: 56978
## n variables: 22
##
## ── Variable type:character ───────────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## airport 0 56978 56978 4 53 0 423
## airport_id 0 56978 56978 3 5 0 423
## atype 0 56978 56978 4 18 0 91
## damage 4324 52654 56978 1 2 0 4
## operator 0 56978 56978 15 18 0 4
## phase_of_flt 11469 45509 56978 4 12 0 24
## precip 21614 35364 56978 3 10 0 7
## sky 20485 36493 56978 8 10 0 3
## species 0 56978 56978 4 50 0 527
## species_id 0 56978 56978 1 6 0 528
## state 0 56978 56978 2 3 0 59
## time_of_day 16133 40845 56978 3 5 0 4
## type_eng 234 56744 56978 1 1 0 4
##
## ── Variable type:factor ──────────────────────────────────────────────────────────────
## variable missing complete n n_unique
## incident_month 0 56978 56978 12
## incident_year 0 56978 56978 29
## month 0 56978 56978 12
## top_counts ordered
## 9: 7980, 10: 7754, 8: 7104, 5: 6161 FALSE
## 201: 4452, 201: 4081, 201: 3764, 201: 3602 FALSE
## Sep: 7980, Oct: 7754, Aug: 7104, May: 6161 TRUE
##
## ── Variable type:numeric ─────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## cost_repairs_infl_adj 56363 615 56978 242387.9 942601.48 11 5128
## height 18038 38940 56978 983.83 2026.76 0 0
## num_engs 233 56745 56978 2.06 0.26 1 2
## speed 30046 26932 56978 154.55 42.28 0 130
## time 26124 30854 56978 1427.55 581.73 -84 930
## p50 p75 p100 hist
## 26783 93123.5 1.6e+07 ▇▁▁▁▁▁▁▁
## 50 1000 25000 ▇▁▁▁▁▁▁▁
## 2 2 4 ▁▁▇▁▁▁▁▁
## 140 170 354 ▁▁▅▇▂▂▁▁
## 1426 1950 2359 ▁▁▆▇▅▆▇▇
##
## ── Variable type:POSIXct ─────────────────────────────────────────────────────────────
## variable missing complete n min max median
## incident_date 0 56978 56978 1990-01-01 2018-12-31 2009-11-03
## n_unique
## 9880
This data has a large number of missing data in some fields. It seems like we should consider analysing individual variables
We put forward the hypothesis that some months has more impacts due to weather and bird behaviours.
We will also visualise the number of incidence per year per month to detect any report-time bias in the data.
month_summ = raw_data %>%
dplyr::group_by(incident_year, month) %>%
dplyr::count()
month_trace_plot = ggplot(month_summ,
aes(x = month,
y = n,
colour = incident_year,
group = incident_year)) +
geom_line() +
labs(title = "Number of birds impacts per year per month",
y = "Number of incidents") +
theme(legend.title = element_blank()) +
guides(colour = guide_legend(nrow = 3, byrow = TRUE))
month_trace_plot
These numbers should be scaled by the number of flights in the US every year. This analysis will be delayed.
It should be clear that the number of impacts are lower in the winter (Dec to Feb) and both spring and autumn has a higher number of impacts. It is interesting that there is a dip in June every year. This could be caused by some sampling bias or somthing about bird behaviours.
state_data = tibble::tibble(
state_abb = datasets::state.abb,
state_name = tolower(datasets::state.name),
state_division = datasets::state.division
)
#########################
state_summ = raw_data %>%
dplyr::filter(incident_year == 2018) %>%
dplyr::mutate(state = factor(state)) %>%
dplyr::group_by(month, state, .drop = FALSE) %>%
dplyr::tally() %>%
ungroup() %>%
left_join(state_data, by = c("state" = "state_abb"))
## Warning: Column `state`/`state_abb` joining factor and character vector,
## coercing into character vector
state_summ
## # A tibble: 648 x 5
## month state n state_name state_division
## <ord> <chr> <int> <chr> <fct>
## 1 Jan AL 1 alabama East South Central
## 2 Jan AR 0 arkansas West South Central
## 3 Jan AZ 1 arizona Mountain
## 4 Jan BC 0 <NA> <NA>
## 5 Jan CA 21 california Pacific
## 6 Jan CO 3 colorado Mountain
## 7 Jan CT 0 connecticut New England
## 8 Jan DC 3 <NA> <NA>
## 9 Jan FL 9 florida South Atlantic
## 10 Jan GA 3 georgia South Atlantic
## # … with 638 more rows
summary(state_summ$n)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 2.00 6.87 6.00 246.00
#################
us_states <- map_data("state") %>%
left_join(state_summ, by = c("region" = "state_name")) %>%
dplyr::filter(complete.cases(month))
ggplot(data = us_states,
mapping = aes(x = long, y = lat,
group = group)) +
geom_polygon(aes(fill = n), colour = "black") +
facet_wrap(~month) +
scale_fill_distiller(palette = "Reds", direction = 1) +
labs(title = "No. of bird impacts in 2018") +
theme_void(18) +
theme(legend.position = "bottom")
gif = ggplot(data = us_states,
mapping = aes(x = long, y = lat,
group = group)) +
geom_polygon(aes(fill = n), colour = "black") +
scale_fill_distiller(palette = "Reds", direction = 1) +
theme_void(18) +
theme(legend.position = "bottom") +
transition_states(month,
transition_length = 0.1,
state_length = 0.5) +
ease_aes('linear') +
labs(title = "No. of bird impacts in 2018",
subtitle = "Month: {closest_state}")
anim_save(filename = "incidents_2018.gif",
animation = gif, width = 800)
time_summ = raw_data %>%
dplyr::filter(complete.cases(time_of_day)) %>%
dplyr::group_by(incident_year, month, time_of_day) %>%
dplyr::count()
time_trace_plot = ggplot(time_summ,
aes(x = month,
y = n,
colour = incident_year,
group = interaction(incident_year, time_of_day))) +
geom_line() +
facet_grid(~ time_of_day) +
labs(title = "Number of birds incidents during the time of day",
y = "Number of incidents") +
theme(legend.title = element_blank(),
axis.text.x = element_text(angle = 90),
legend.position = "none")
time_trace_plot
Interestingly, the dipping effect in June was only visible at night times.
combine_trace_plot = cowplot::plot_grid(
month_trace_plot,
time_trace_plot,
nrow = 2,
rel_heights = c(2, 1))
combine_trace_plot
ggsave(plot = combine_trace_plot,
filename = "combine_trace_plot.png",
width = 12, height = 8)
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggpubr_0.2.1 magrittr_1.5 gganimate_1.0.3 maps_3.3.0
## [5] skimr_1.0.7 janitor_1.2.0 forcats_0.4.0 stringr_1.4.0
## [9] dplyr_0.8.3 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3
## [13] tibble_2.1.3 ggplot2_3.2.0 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 lubridate_1.7.4 lattice_0.20-38
## [4] class_7.3-15 prettyunits_1.0.2 png_0.1-7
## [7] assertthat_0.2.1 zeallot_0.1.0 digest_0.6.20
## [10] utf8_1.1.4 R6_2.4.0 cellranger_1.1.0
## [13] plyr_1.8.4 backports_1.1.4 e1071_1.7-2
## [16] evaluate_0.14 httr_1.4.0 pillar_1.4.2
## [19] rlang_0.4.0 progress_1.2.2 lazyeval_0.2.2
## [22] readxl_1.3.1 rstudioapi_0.10 gifski_0.8.6
## [25] rmarkdown_1.13 labeling_0.3 munsell_0.5.0
## [28] broom_0.5.2 compiler_3.6.0 modelr_0.1.4
## [31] xfun_0.8 pkgconfig_2.0.2 htmltools_0.3.6
## [34] tidyselect_0.2.5 lpSolve_5.6.13.1 fansi_0.4.0
## [37] sf_0.7-6 crayon_1.3.4 withr_2.1.2
## [40] grid_3.6.0 DBI_1.0.0 nlme_3.1-140
## [43] jsonlite_1.6 gtable_0.3.0 units_0.6-3
## [46] scales_1.0.0 KernSmooth_2.23-15 cli_1.1.0
## [49] stringi_1.4.3 reshape2_1.4.3 farver_1.1.0
## [52] ggsignif_0.5.0 snakecase_0.11.0 xml2_1.2.0
## [55] vctrs_0.2.0 generics_0.0.2 cowplot_1.0.0
## [58] transformr_0.1.1 RColorBrewer_1.1-2 tools_3.6.0
## [61] glue_1.3.1 tweenr_1.0.1 hms_0.5.0
## [64] yaml_2.2.0 colorspace_1.4-1 classInt_0.3-3
## [67] rvest_0.3.4 knitr_1.23 haven_2.1.1